home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Skunkware 5
/
Skunkware 5.iso
/
src
/
X11
/
wais
/
ir
/
irretrvl.c
< prev
next >
Wrap
C/C++ Source or Header
|
1995-05-09
|
16KB
|
559 lines
/* WIDE AREA INFORMATION SERVER SOFTWARE:
No guarantees or restrictions. See the readme file for the full standard
disclaimer.
*/
#ifndef lint
/* RCS identification string for this file.  It is not referenced anywhere
   in the code visible here; the definition is compiled out when `lint` is
   defined (presumably so lint does not report the unused variable). */
static char *RCSid = "$Header: /tmp_mnt/net/quake/proj/wais/wais-8-b5/ir/RCS/irretrvl.c,v 1.30 92/05/10 14:43:59 jonathan Exp $";
#endif
/* Change log:
* $Log: irretrvl.c,v $
* Revision 1.30 92/05/10 14:43:59 jonathan
*
* Made a little safer on NULL docid's when parsing.
*
* Revision 1.29 92/05/06 17:31:26 jonathan
* modified #if's for NeXT and Mach. Added S_ISDIR definition for them both.
*
* Revision 1.28 92/05/04 17:19:54 jonathan
* Added test for parsing docids (if null, log error).
*
* Revision 1.27 92/04/28 16:56:08 morris
* added boolean to serial engine
*
* Revision 1.26 92/04/01 17:09:46 jonathan
* Added index_directory to check_for_legitimate_file to test if filename is
* under default directory (for FTP-like retrieval).
*
*
* Revision 1.25 92/03/18 08:54:41 jonathan
* Removed databaseName argument from getData and getDocumentText. The
* database name is now culled from the docid. Removed special cases for INFO
* and Quest db's, as they should no longer be needed.
*
* Revision 1.24 92/02/18 14:04:49 jonathan
* in check_for_legitimate_file: added INFO to the list of special case
* retrievals from MAC's.
*
* Revision 1.23 92/02/18 11:53:45 jonathan
* conditionalized use of tempnam for NeXT (doesn't exist, use tmpnam
* instead). May be a BSD thing.
*
* Revision 1.22 92/02/17 12:38:52 jonathan
* special case catalog in check_for_legitimate_file.
*
* Revision 1.21 92/02/16 18:04:52 jonathan
* Demoted more WLOG_ERROR's to WLOG_WARNING's
*
* Revision 1.20 92/02/15 19:40:30 jonathan
* Improved reporting of retrieval errors.
*
* Revision 1.19 92/02/15 18:58:38 jonathan
* Changed most (but not all) waislog errors to warnings on retrieval.
*
* Revision 1.18 92/02/14 16:06:20 jonathan
* Fixed text in error message for invalid docid (not in DB)
*
* Revision 1.17 92/02/14 15:24:08 jonathan
* Made parseDocID public.
*
* Revision 1.16 92/02/12 13:29:35 jonathan
* Added "$Log" so RCS will put the log message in the header
*
*/
/* retrieval part of the serial ir engine. if you are using a different
storage system for the documents, replace this file.
-brewster
10/91 added .Z file support from mlm@cs.brown.edu (Moises Lejter)
to do:
handle .Z files at a lower level.
*/
#include "irretrvl.h"
#include "irfiles.h" /* for filename_table_ext */
#include <string.h>
#include <stdlib.h> /* for free (tempnam result) */
#include "futil.h"
#include <ctype.h> /* for isspace */
#include "irext.h"
#include "irdirent.h"
#include <sys/stat.h>
/* Fallback definitions of S_ISDIR() for the platforms handled below.
   The file type is a multi-bit field inside the mode word, so the mode
   has to be masked with the type mask and compared against the directory
   value; a plain AND against the directory bit would also accept any
   other file type whose encoding happens to contain that bit.
   The macro argument is parenthesized so that expressions can be passed. */
#ifdef Mach
#include <sys/inode.h>
#define S_ISDIR(f_mode) (((f_mode) & IFMT) == IFDIR)
#endif /* Mach */
#if (defined(NeXT) && !(defined(S_ISDIR)))
#define S_ISDIR(f_mode) (((f_mode) & S_IFMT) == S_IFDIR)
#endif
/*----------------------------------------------------------------------*/
boolean
parseDocID(doc,filename,start_character,end_character,errorCode)
DocObj* doc;
char* filename;
long* start_character;
long* end_character;
long* errorCode;
/* Split the local id of doc's DocumentID, which has the form
     "<start> <end> <filename>"
   (fields separated by a single whitespace character), into its parts.

   doc             - document object whose DocumentID is parsed.
   filename        - receives the file name; at most MAX_FILENAME_LEN bytes
                     are written and the result is always NUL-terminated
                     (an over-long name is truncated).
   start_character - receives the first number of the local id.
   end_character   - receives the second number of the local id.
   errorCode       - set to GDT_BadDocID whenever false is returned.

   Returns true on success.  On failure the output parameters other than
   errorCode are left untouched.
*/
{
  DocID* theDocID = NULL;
  char* local_id = NULL;
  long i;
  long firstSep;          /* index of the separator after <start> */
  long secondSep;         /* index of the separator after <end> */
  char firstSepChar;
  char secondSepChar;
  long startValue;
  long endValue;

  if (doc == NULL || (theDocID = docIDFromAny(doc->DocumentID)) == NULL)
    {
      *errorCode = GDT_BadDocID;
      return(false);
    }

  local_id = anyToString(GetLocalID(theDocID));
  freeDocID(theDocID);
  if (local_id == NULL)
    {
      *errorCode = GDT_BadDocID;
      return(false);
    }

  /* find the end of the first field (the start position).
     The cast keeps isspace() defined for bytes >= 0x80 when char is signed. */
  for (i = 0;
       local_id[i] != '\0' && !isspace((unsigned char)local_id[i]);
       i++)
    ;
  firstSep = i;
  if (local_id[firstSep] == '\0')
    goto bad_docid;

  /* find the end of the second field (the end position) */
  for (i = firstSep + 1;
       local_id[i] != '\0' && !isspace((unsigned char)local_id[i]);
       i++)
    ;
  secondSep = i;
  if (local_id[secondSep] == '\0')
    goto bad_docid;

  /* temporarily terminate both numeric fields so each one is scanned
     in isolation, and insist that both really are numbers */
  firstSepChar = local_id[firstSep];
  secondSepChar = local_id[secondSep];
  local_id[firstSep] = '\0';
  local_id[secondSep] = '\0';
  if (sscanf(local_id,"%ld",&startValue) != 1 ||
      sscanf(local_id + firstSep + 1,"%ld",&endValue) != 1)
    {
      /* put the separators back so the whole id shows up in the log */
      local_id[firstSep] = firstSepChar;
      local_id[secondSep] = secondSepChar;
      goto bad_docid;
    }

  *start_character = startValue;
  *end_character = endValue;

  /* and finally the file name.  strncpy() does not terminate the
     destination when the source fills it, so force a terminator inside
     the MAX_FILENAME_LEN bytes it has just written. */
  strncpy(filename,local_id + secondSep + 1,MAX_FILENAME_LEN);
  filename[MAX_FILENAME_LEN - 1] = '\0';

  s_free(local_id);
  return(true);

bad_docid:
  waislog(WLOG_HIGH, WLOG_WARNING,
          "Attempt to retrieve data for bad doc-id: '%s'",local_id);
  *errorCode = GDT_BadDocID;
  s_free(local_id);
  return(false);
}
/*----------------------------------------------------------------------*/
/* Decide whether `filename` may be retrieved on behalf of the database
   `database_name`.

   filename        - path of the file the client asked for.
   database_name   - full pathname of the database.
   index_directory - default directory, or NULL.

   Returns true when
     - filename contains ".src" or ".cat" (the help and catalog files are
       not listed in the filename table, so they are special cased), or
     - filename is listed in the database's filename table, or
     - it is not listed but index_directory is NULL, or
       substrcmp(filename, index_directory) is non-zero.
   Returns false (after logging a warning) when filename is a directory
   or when none of the conditions above hold.

   NOTE(review): the ".src"/".cat" test is a substring match anywhere in
   the path, not a suffix match - confirm that this is intended. */
static boolean check_for_legitimate_file
_AP((char *filename, char* database_name, char* index_directory));
static boolean check_for_legitimate_file(filename, database_name, index_directory)
char *filename;
char *database_name; /* full pathname of the database */
char *index_directory;
{
  struct stat sbuf;
  /* name of the file holding the filename table for this db
     (eg /bar/foo.fn). confusing, no? */
  char filename_table_filename[MAX_FILE_NAME_LEN +1];
  size_t used;

  /* the help file and catalog file (the .src and .cat files) must be
     special cased because they are not in the filename table */
  if(NULL != strstr(filename, ".src")) /* let it pass */
    return(true);
  if(NULL != strstr(filename, ".cat")) /* let it pass */
    return(true);

  /* sbuf only holds meaningful data when stat() succeeds; a file that
     cannot be stat'ed (e.g. only a compressed copy exists) is simply
     not a directory as far as this test is concerned */
  if(stat(filename, &sbuf) == 0 && S_ISDIR(sbuf.st_mode)) {
    waislog(WLOG_HIGH, WLOG_WARNING,
            "File: '%s' is a directory, and cannot be retrieved.",
            filename);
    return(false);
  }

  /* build "<db directory>/<db file><filename table extension>".
     strncat()'s count is the number of characters to append, not the
     size of the destination, so pass only the space that is left. */
  pathname_directory(database_name, filename_table_filename);
  used = strlen(filename_table_filename);
  if (used < MAX_FILE_NAME_LEN)
    strncat(filename_table_filename, "/", MAX_FILE_NAME_LEN - used);
  used = strlen(filename_table_filename);
  if (used < MAX_FILE_NAME_LEN)
    strncat(filename_table_filename,
            database_file(pathname_name(database_name)),
            MAX_FILE_NAME_LEN - used);
  s_strncat(filename_table_filename, filename_table_ext, MAX_FILE_NAME_LEN,
            MAX_FILE_NAME_LEN);

  /* caching is done in filename_in_filename_file for repeated requests
     for the same file, so it does not need to be repeated here. */
  if(filename_in_filename_file(filename, NULL, NULL, filename_table_filename))
    return(true); /* everything is peachy */

  /* the file is not in the table: either the db does not exist, or the
     file is not in that db.  It is still accepted when there is no index
     directory or when substrcmp() matches it against the index directory. */
  if(index_directory == NULL)
    return true;
  if(substrcmp(filename, index_directory))
    return true;

  /* we lose.  Log the bad news */
  waislog(WLOG_HIGH, WLOG_WARNING,
          "File: '%s' is not in DB '%s', and cannot be retrieved.",
          filename, filename_table_filename);
  return(false);
}
/*----------------------------------------------------------------------*/
WAISDocumentText* getData(doc, errorCode, index_directory)
DocObj* doc;
long* errorCode;
char* index_directory;
/* it isn't text, so we can just grab data */
{
FILE* file = NULL;
char fileName[MAX_FILENAME_LEN + 1];
char* dbname = NULL;
WAISDocumentText* data = NULL;
long start,end; /* position of the document in the file */
long startByte,endByte,bytes,bytesRead; /* part of the doc that we want */
char* buffer = NULL;
any* bufAny = NULL;
DocID *docid;
#if (defined(NeXT) || defined(Mach))
char tmpFileName[MAX_FILENAME_LEN+1];
#else
char *tmpFileName = NULL;
#endif /* NeXT or Mach */
/* we can only handle byte chunks here */
if ((doc->ChunkCode == CT_byte) ||
(doc->ChunkCode == CT_document)) {
if (parseDocID(doc,fileName,&start,&end,errorCode) == false)
{
waislog(WLOG_HIGH, WLOG_WARNING, "can't parse docid");
*errorCode = GDT_MissingDocID;
return(NULL);
}
*errorCode = GDT_NoError;
docid = docIDFromAny(doc->DocumentID);
dbname = anyToString(GetDatabase(docid));
freeDocID(docid);
if(true == check_for_legitimate_file(fileName, dbname, index_directory)){
file = s_fopen(fileName,"rb");
if (file == NULL){
if(probe_file_possibly_compressed(fileName)) {
char buffer[ 2 * MAX_FILENAME_LEN + 10 ];
#if (defined(NeXT) || defined(Mach))
tmpnam(tmpFileName);
#else
tmpFileName = tempnam( "/tmp/", 0 );
#endif /* NeXT or Mach */
sprintf( buffer, "zcat %s.Z > %s", fileName, tmpFileName );
system( buffer );
file = s_fopen(tmpFileName,"rb");
}
}
}
if (file == NULL) {
waislog(WLOG_HIGH, WLOG_WARNING,
"Attempt to retrieve data for missing doc-id: '%s'",
fileName);
*errorCode = GDT_MissingDocID;
s_free(dbname);
return(NULL);
}
if (doc->ChunkCode == CT_byte) {
startByte = doc->ChunkStart.Pos + start;
endByte = doc->ChunkEnd.Pos + start;
}
else {
startByte = start;
endByte = end;
}
waislog(WLOG_LOW, WLOG_RETRIEVE,
"Retrieving DocID: %d %d %s, byte: %d %d, from database %s",
start, end, fileName, startByte, endByte, dbname);
s_free(dbname);
if (endByte > end && end != 0) {
waislog(WLOG_HIGH, WLOG_WARNING,
"retrieval beyond bounds of document %ld in file <%s>",
endByte,fileName);
*errorCode = GDT_BadRange;
endByte = end;
}
/* get the bytes */
if (fseek(file,startByte,SEEK_SET) != 0)
{
waislog(WLOG_HIGH, WLOG_WARNING,
"retrieval can't seek to %ld in file <%s>",startByte,
fileName);
*errorCode = GDT_BadRange;
if (tmpFileName) unlink( tmpFileName );
if (tmpFileName) unlink( tmpFileName );
if (tmpFileName) unlink( tmpFileName );
return(NULL);
}
bytes = endByte - startByte;
buffer = (char*)s_malloc(bytes);
bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
if (bytesRead != bytes)
{
waislog(WLOG_HIGH, WLOG_WARNING,
"retrieval error in file <%s>",fileName);
*errorCode = GDT_BadRange;
if (bytesRead == 0)
return(NULL);
}
bufAny = makeAny(bytesRead,buffer);
data = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
/* the any and the buffer are freed by freeWAISSearchResponse() */
s_fclose(file);
if (tmpFileName) unlink( tmpFileName );
return(data);
}
else
{
waislog(WLOG_HIGH, WLOG_WARNING,
"search engine can only use whole documents or byte offsets for data lookup");
*errorCode = GDT_UnsupportedChunkType;
return(NULL);
}
}
/*----------------------------------------------------------------------*/
/* size of the scratch buffer used while counting lines */
#define BUFSZ ((size_t)5000)
WAISDocumentText* getDocumentText(doc, errorCode, index_directory)
DocObj* doc;
long* errorCode;
char* index_directory;
/* Find the text for doc, get the sub part if any, and finally construct
   and return a WAISDocumentText.

   Only the chunk code CT_line is supported: the lines from
   doc->ChunkStart.Pos up to doc->ChunkEnd.Pos are located by counting
   newline characters, starting at the document's start offset taken from
   the doc-id (see parseDocID).  If the file cannot be opened but
   probe_file_possibly_compressed() accepts it, "<file>.Z" is uncompressed
   with zcat into a temporary file which is read instead and removed
   before returning.

   doc             - the document (or part of it) to fetch.
   errorCode       - receives GDT_NoError on success, otherwise one of
                     GDT_UnsupportedChunkType, GDT_MissingDocID,
                     GDT_MissingDatabase or GDT_BadRange.
   index_directory - passed through to check_for_legitimate_file.

   Returns a new WAISDocumentText, or NULL if the document can not be
   found (or on some other error).

   NOTE(review): the line scan is not bounded by the document's end offset
   from the doc-id, so a line range past the end of the document appears
   to run on into whatever follows it in the file - confirm intent.
*/
{
  WAISDocumentText* text = NULL;
  FILE* file = NULL;
  char* dbname = NULL;
  char* scanBuffer = NULL; /* scratch space for the line counting */
  char* buffer = NULL;     /* the text handed back to the caller */
  any* bufAny = NULL;
  char filename[MAX_FILENAME_LEN + 1];
  long start_character;
  long end_character;
  register long i;
  long bytes,bytesRead;
  long startByte,endByte,byte,lines;
  long firstLine,lastLine;
  char* tmpFileName = NULL; /* non-NULL only once a temp name was made */
#if (defined(NeXT) || defined(Mach))
  char tmpFileNameBuf[MAX_FILENAME_LEN+1];
#endif /* NeXT or Mach */
  DocID* theDocID = NULL;
  char* local_id = NULL;
  char* idForLog = "(null)"; /* local_id, or a placeholder for the logs */

  *errorCode = GDT_NoError;

  /* we can only handle line chunks for now */
  if (doc->ChunkCode != CT_line)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "search engine can only use line offsets for now.");
      *errorCode = GDT_UnsupportedChunkType;
      return(NULL);
    }
  firstLine = (long)doc->ChunkStart.Pos;
  lastLine = (long)doc->ChunkEnd.Pos;

  theDocID = docIDFromAny(doc->DocumentID);
  if (theDocID != NULL)
    {
      dbname = anyToString(GetDatabase(theDocID));
      local_id = anyToString(GetLocalID(theDocID));
      freeDocID(theDocID);
    }
  if (local_id != NULL)
    idForLog = local_id;

  /* parseDocID writes at most MAX_FILENAME_LEN bytes, so this last byte
     guarantees the name is terminated whatever it copies */
  filename[MAX_FILENAME_LEN] = '\0';
  if (parseDocID(doc,filename,&start_character,&end_character,errorCode) ==
      false)
    {
      waislog(WLOG_HIGH, WLOG_ERROR,
              "Can't parse doc-id: '%s'", idForLog);
      *errorCode = GDT_MissingDocID;
      goto cleanup;
    }

  /* check the database before its name is used for anything */
  if (NULL == dbname)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "Missing database for doc-id: '%s'", idForLog);
      *errorCode = GDT_MissingDatabase;
      goto cleanup;
    }

  waislog(WLOG_LOW, WLOG_RETRIEVE,
          "Retrieving DocID: '%s', line range: %ld %ld, from database %s",
          idForLog, firstLine, lastLine, dbname);

  if (check_for_legitimate_file(filename, dbname, index_directory) == false)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "doc-id: '%s' not in database '%s'", idForLog,dbname);
      *errorCode = GDT_MissingDocID;
      goto cleanup;
    }

  file = s_fopen(filename,"r");
  if (file == NULL && probe_file_possibly_compressed(filename))
    {
      /* "zcat '" + name + ".Z' > '" + tmp name + "'" + NUL */
      char command[ 2 * MAX_FILENAME_LEN + 32 ];
#if (defined(NeXT) || defined(Mach))
      tmpFileName = tmpnam(tmpFileNameBuf);
#else
      tmpFileName = tempnam( "/tmp/", 0 );
#endif /* NeXT or Mach */
      if (tmpFileName != NULL)
        {
          /* The file name comes from the doc-id and ends up in a shell
             command line.  Both names are wrapped in single quotes,
             inside which the shell treats nothing but the quote itself
             specially - so names containing a quote are refused. */
          if (strchr(filename, '\'') != NULL ||
              strchr(tmpFileName, '\'') != NULL ||
              strlen(tmpFileName) > MAX_FILENAME_LEN)
            {
              waislog(WLOG_HIGH, WLOG_WARNING,
                      "refusing to uncompress file with unsafe name: '%s'",
                      filename);
            }
          else
            {
              sprintf( command, "zcat '%s.Z' > '%s'", filename, tmpFileName );
              system( command );
              file = s_fopen(tmpFileName,"r");
            }
        }
    }

  if (file == NULL)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "Attempt to retrieve text for bad doc-id: '%s'", idForLog);
      *errorCode = GDT_MissingDocID;
      goto cleanup;
    }

  if (0 != fseek(file, start_character, SEEK_SET))
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              " error on attempt to seek into file for doc-id: '%s'", idForLog);
      *errorCode = GDT_BadRange;
      goto cleanup;
    }

  /* find the start byte: count line ends until firstLine is reached.
     `byte` is the offset relative to start_character. */
  scanBuffer = (char*)s_malloc(BUFSZ);
  lines = byte = 0;
  while (lines < firstLine)
    { /* search a buffer full */
      bytesRead = fread(scanBuffer,(size_t)sizeof(char),BUFSZ,file);
      for (i = 0; i < bytesRead && lines < firstLine; i++, byte++)
        {
          /* \r should not happen because we are reading the file in text
             mode */
          if (scanBuffer[i] == '\n' || scanBuffer[i] == '\r')
            lines++;
        }
      if (bytesRead == 0) /* cheesy handling of files that don't end with nl */
        lines++;
    }
  startByte = byte;
  beFriendly();

  /* find the end byte */ /* this could be done while getting the bytes XXX */
  /* search starting from the start pos */
  if (fseek(file,startByte + start_character,SEEK_SET) != 0)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "retrieval can't seek to %ld in file <%s>",
              startByte + start_character,filename);
      *errorCode = GDT_BadRange;
      goto cleanup;
    }
  beFriendly();
  while (lines < lastLine)
    { /* search a buffer full */
      bytesRead = fread(scanBuffer,(size_t)sizeof(char),BUFSZ,file);
      for (i = 0; i < bytesRead && lines < lastLine; i++, byte++)
        {
          /* \r should not happen, we are reading the file in text mode */
          if (scanBuffer[i] == '\n' || scanBuffer[i] == '\r')
            lines++;
        }
      if (bytesRead == 0) /* cheesy handling of files that don't end with nl */
        lines++;
    }
  endByte = byte;
  beFriendly();

  /* get the bytes */
  if (fseek(file,startByte + start_character,SEEK_SET) != 0)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "retrieval can't seek to %ld in file <%s>",
              startByte + start_character,filename);
      *errorCode = GDT_BadRange;
      goto cleanup;
    }

  bytes = endByte - startByte;
  buffer = (char*)s_malloc(bytes);
  bytesRead = fread((void*)buffer,(size_t)sizeof(char),bytes,file);
  if (bytesRead != bytes)
    {
      waislog(WLOG_HIGH, WLOG_WARNING,
              "retrieval error in file <%s>",filename);
      *errorCode = GDT_BadRange;
      s_free(buffer); /* nothing takes ownership of it on this path */
      goto cleanup;
    }

  bufAny = makeAny(bytesRead,buffer);
  text = makeWAISDocumentText(duplicateAny(doc->DocumentID),0L,bufAny);
  /* the any and the buffer are freed by freeWAISSearchResponse() */
  *errorCode = GDT_NoError;

cleanup:
  /* single exit: release everything that is still owned here */
  if (file != NULL)
    s_fclose(file);
  if (tmpFileName != NULL)
    {
      unlink( tmpFileName );
#if !(defined(NeXT) || defined(Mach))
      free(tmpFileName); /* tempnam() allocates its result */
#endif /* not NeXT or Mach */
    }
  if (scanBuffer != NULL)
    {
      s_free(scanBuffer);
    }
  if (dbname != NULL)
    {
      s_free(dbname);
    }
  if (local_id != NULL)
    {
      s_free(local_id);
    }
  return(text);
}